First, we need some source code that we want to analyze.
Make sure all of the code is formatted in the same style before you repeat this analysis; otherwise, the results can be misleading.
In [ ]:
In [91]:
import glob
import pandas as pd
import os
# Root directory of the Java project to analyze.
DIR = "../../dropover"
#DIR = "../../intellij-community/"
# One indentation level is TEXT repeated MULTIPLIER times (here: four spaces).
TEXT = " "
MULTIPLIER = 4

# Collect every *.java file below DIR (recursively), one row per file.
filelist = pd.DataFrame(glob.glob(DIR + "/**/*.java", recursive=True), columns=["abspath"])

# Project-relative path with forward slashes.
# - Strip DIR itself (not a hard-coded copy of it) so changing DIR keeps working.
# - regex=False: both arguments are literal text; the default of `regex`
#   differs between pandas versions, and "." / "\" are regex metacharacters.
filelist['path'] = (
    filelist['abspath']
    .str.replace(DIR, "", regex=False)
    .str.replace("\\", "/", regex=False)
)

# Only now turn the glob result into an absolute path for opening the files.
filelist['abspath'] = filelist['abspath'].apply(os.path.abspath)
filelist.head()
Out[91]:
In [92]:
# Module-level counters, all starting at zero. count_measures() below uses
# its own local `indentations` / `lines`, so these globals are not updated by it.
indentations = lines = li = 0

# The text that makes up one indentation level.
pattern = MULTIPLIER * TEXT
def count_measures(abspath):
    """Count indentation levels and lines of a single text file.

    For every line, the number of consecutive repetitions of the
    module-level ``pattern`` at the *start* of the line is added to the
    indentation total. Occurrences of the pattern further inside a line
    (aligned comments, string literals, ...) are not indentation and are
    therefore not counted.

    Parameters
    ----------
    abspath : str
        Path of the file to read. It is opened as UTF-8 text.

    Returns
    -------
    pandas.Series
        ``[indentations, lines]``. If the file cannot be decoded as UTF-8
        or may not be read, the problem is printed and ``[0, 0]`` is
        returned so that the surrounding ``apply`` keeps going.
    """
    unit_width = len(pattern)
    indentations = 0
    lines = 0
    try:
        with open(abspath, encoding='utf-8') as f:
            # Iterate lazily instead of materializing the file via readlines().
            for line in f:
                depth = 0
                # `unit_width and ...` guards against an empty pattern.
                while unit_width and line.startswith(pattern, depth * unit_width):
                    depth += 1
                indentations += depth
                lines += 1
    except (UnicodeDecodeError, PermissionError) as e:
        print(abspath, e)
        # Do not report partial counts for a file that could not be read completely.
        indentations = 0
        lines = 0
    return pd.Series([indentations, lines])
# Measure every file. Each call returns a two-element Series, so the result
# of apply() has two columns, which are stored as 'indentations' and 'lines'.
measures = filelist['abspath'].apply(count_measures)
filelist[['indentations', 'lines']] = measures
filelist.head()
Out[92]:
In [93]:
%matplotlib inline
filelist[['indentations', 'lines']].plot(logy=True)
Out[93]:
In [94]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the columns of filelist.
scatter_options = {'alpha': 0.8, 'figsize': (12, 12)}
scatter_matrix(filelist, **scatter_options)
Out[94]:
In [95]:
# Indentation count per line of each file.
filelist['prop'] = filelist['indentations'].div(filelist['lines'])
# Show the files with the highest ratio first.
filelist.sort_values('prop', ascending=False).head()
Out[95]:
In [96]:
# Scale 'prop' by its maximum so that the highest ratio becomes 1.
highest_prop = filelist['prop'].max()
filelist['rel_prop'] = filelist['prop'].div(highest_prop)
filelist.head()
Out[96]:
In [97]:
from matplotlib import cm
from matplotlib.colors import rgb2hex
# Work on a copy so the colour column does not end up in filelist.
plot_data = filelist.copy()
# Map each relative ratio through the 'coolwarm' colormap and store it as a hex string.
rgba_values = cm.coolwarm(filelist['rel_prop'])
plot_data['color'] = list(map(rgb2hex, rgba_values))
plot_data.head()
Out[97]:
In [176]:
import os
import json
def _children_of(children, name):
    """Return the 'children' list of the entry called ``name`` in ``children``.

    The entry is created and appended when no entry with that name exists
    yet; an existing entry without a 'children' list gets an empty one.
    """
    entry = next((child for child in children if child.get('name') == name), None)
    if entry is None:
        entry = {'name': name}
        children.append(entry)
    return entry.setdefault('children', [])


# Nested {'name': ..., 'children': [...]} tree: one node per directory of
# 'path', one leaf per file carrying its line count ('size') and 'color'.
json_data = {'name': 'flare', 'children': []}

for _, series in plot_data.iterrows():
    path, filename = os.path.split(series['path'])

    # Walk down (and create on demand) the directory nodes of this file.
    siblings = json_data['children']
    for path_part in path.split("/"):
        siblings = _children_of(siblings, path_part)

    siblings.append({
        # Label with the measures; the closing "]" was missing before.
        'name': "{}[l: {}, i: {}, p: {}]".format(
            filename,
            series['lines'],
            series['indentations'],
            series['rel_prop']),
        'size': series['lines'],
        'color': series['color']})

# Make sure the output directory exists before writing into it.
os.makedirs("vis", exist_ok=True)
with open("vis/flare.json", mode='w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, indent=3)
In [177]:
# Take an explicit copy: assigning a column on a slice of plot_data triggers
# pandas' SettingWithCopyWarning and leaves it unclear which frame is modified.
radial_data = plot_data[['path', 'lines']].copy()

# "/a/b/C.java" -> "dropover.a.b.C.java":
# turn the separators into dots (literal replacement, hence regex=False),
# drop the leading dot and prepend the project name.
radial_data['path'] = (
    "dropover."
    + radial_data['path'].str.replace("/", ".", regex=False).str[1:]
)
radial_data.head()
Out[177]:
In [178]:
# Keep only Java files of the backend that are not below a "src.test" folder.
dotted_paths = radial_data['path']
is_backend = dotted_paths.str.startswith("dropover.backend")
is_java = dotted_paths.str.endswith(".java")
# regex=False: "src.test" is literal text; as a regular expression the "."
# would match any character (e.g. "src_test" or "srcXtest").
is_test = dotted_paths.str.contains("src.test", regex=False)

radial_data = radial_data.loc[is_backend & is_java & ~is_test]
radial_data.head()
Out[178]:
In [179]:
# Build the rows of the hierarchy CSV: for every file one row per ancestor
# (empty value) plus one row for the file itself (its line count as value).
records = []
for _, series in radial_data.iterrows():
    current_path = series['path']
    current_lines = series['lines']

    # Strip only the file extension. A plain replace(".java", "") would also
    # remove ".java" inside the path, e.g. the "java" folder of
    # "src.main.java" or the start of a package like ".javax".
    if current_path.endswith(".java"):
        current_path = current_path[:-len(".java")]

    path_parts = current_path.split(".")
    for depth in range(1, len(path_parts) + 1):
        node_id = ".".join(path_parts[:depth])
        is_leaf = depth == len(path_parts)
        records.append((node_id, str(current_lines) if is_leaf else ""))

# Ancestors are emitted once per file below them, so drop the repeated rows.
# The columns keep the positional labels 0 and 1; they are only named in the CSV.
data = pd.DataFrame(records, columns=[0, 1]).drop_duplicates()

# Make sure the output directory exists before writing into it.
os.makedirs("vis", exist_ok=True)
data.to_csv("vis/flare.csv", index=False, header=['id', 'value'])
data
Out[179]: